/*************************************************************************
 * The contents of this file are subject to the MYRICOM MYRINET          *
 * EXPRESS (MX) NETWORKING SOFTWARE AND DOCUMENTATION LICENSE (the       *
 * "License"); User may not use this file except in compliance with the  *
 * License.  The full text of the License can found in LICENSE.TXT       *
 *                                                                       *
 * Software distributed under the License is distributed on an "AS IS"   *
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See  *
 * the License for the specific language governing rights and            *
 * limitations under the License.                                        *
 *                                                                       *
 * Copyright 2003 - 2004 by Myricom, Inc.  All rights reserved.          *
 *************************************************************************/

#ifndef MX__REQUESTS_H
#define MX__REQUESTS_H

#include "mx__valgrind.h"
#include "mcp_config.h"
#include "mx__driver_interface.h"
#include "mx__endpoint.h"
#include "mx__wire.h"
#include "mx__segment.h"
#include "mx_byteswap.h"
#include "mx_stbar.h"
#include "mx_cpu.h"

/*************************
 * Generic copy routines *
 *************************/
/*
 * Copy `size` bytes from `from64` to `to`, one 64-bit word at a time,
 * writing through a volatile destination pointer.
 *
 * Every word except the last is stored first; the last word is read into a
 * temporary, MX_STBAR() is issued when `fence` is non-zero, the last word is
 * stored, and a final MX_STBAR() always follows.
 *
 * The assertion requires size > 0, size a multiple of 8, and both pointers
 * 8-byte aligned.
 */
static inline void 
mx_copy64_inline(void *to, uint64_t *from64, unsigned size, int fence)
{
  register volatile uint64_t *dst = (volatile uint64_t *) to;
  uint64_t last_word;
  int words_left = size / 8 - 1;	/* words stored before the final one */

  mx_assert ((size > 0)
             && !(size & 7) && !((uintptr_t)to & 7) 
             && !((uintptr_t)from64 & 7));

  while (words_left != 0)
    {
      *dst = *from64;
      dst++;
      from64++;
      words_left--;
    }

  /* Read the final word before the optional barrier, store it after. */
  last_word = *from64;
  if (fence)
    MX_STBAR();
  *dst = last_word;
  MX_STBAR();
}

/*
 * Copy `size` bytes from `from32` to `to`, one 32-bit word at a time,
 * writing through a volatile destination pointer.
 *
 * Every word except the last is stored first; MX_STBAR() is then issued
 * unconditionally, the last word is stored, and a second MX_STBAR() follows.
 *
 * The assertion requires size > 0, size a multiple of 4, and both pointers
 * 4-byte aligned.
 */
static inline void 
mx_copy32_inline(void *to, uint32_t *from32, unsigned size)
{
  register volatile uint32_t *dst = (volatile uint32_t *) to;
  uint32_t last_word;
  int words_left = size / 4 - 1;	/* words stored before the final one */

  mx_assert ((size > 0)
	     && !(size & 3) && !((uintptr_t)to & 3) 
	     && !((uintptr_t)from32 & 3));

  while (words_left != 0)
    {
      *dst = *from32;
      dst++;
      from32++;
      words_left--;
    }

  /* Read the final word, fence, then publish it. */
  last_word = *from32;
  MX_STBAR();
  *dst = last_word;
  MX_STBAR();
}

/******************************
 * Various ureq copy routines *
 ******************************/
#if MX_COPY_WITH_SSE2
/*
 * Copy one 64-byte request from `from` to `to` using SSE2 registers
 * xmm0-xmm3.
 *
 * All loads, and all 16-byte stores, use movdqa, which requires 16-byte
 * aligned addresses.
 *
 * fence != 0: the first 56 bytes are stored (three 16-byte stores plus the
 *   low 8 bytes of xmm3 at offset 48), an sfence is issued, xmm3 is shifted
 *   right by 8 bytes, the remaining 8 bytes are stored at offset 56, and a
 *   final sfence follows.  The last 8 bytes are therefore written after the
 *   rest of the request has been fenced.
 * fence == 0: an sfence is issued between the loads and the stores, the
 *   64 bytes are written with four 16-byte stores, and one sfence follows.
 *
 * `to` is passed in the "a" register and `from` in the "d" register; the
 * asm declares a "memory" clobber.  MX_ASM_XMM_DEP is defined elsewhere --
 * presumably it appends the xmm register clobbers; confirm.
 */
static inline void
mx_copy_ureq_with_sse2(void *to, uint64_t *from, int fence)
{
  if (fence) {
    /* Last quadword is held back until after the first sfence. */
    __asm__ __volatile__(
		       "movdqa    (%1),%%xmm0\n\t"
		       "movdqa  16(%1),%%xmm1\n\t"
		       "movdqa  32(%1),%%xmm2\n\t"
		       "movdqa  48(%1),%%xmm3\n\t"
		       "movdqa  %%xmm0,   (%0)\n\t"
		       "movdqa  %%xmm1, 16(%0)\n\t"
		       "movdqa  %%xmm2, 32(%0)\n\t"
		       "movq  %%xmm3,  48(%0)\n\t"
		       "sfence\n\t"
		       "psrldq $8, %%xmm3 \n\t"
		       "movq  %%xmm3,  56(%0)\n\t"
		       "sfence\n\t"
		       ::"a"(to), "d"(from) : "memory" MX_ASM_XMM_DEP);
  } else {
    /* Whole request written in one go, bracketed by two sfences. */
    __asm__ __volatile__(
		       "movdqa    (%1),%%xmm0\n\t"
		       "movdqa  16(%1),%%xmm1\n\t"
		       "movdqa  32(%1),%%xmm2\n\t"
		       "movdqa  48(%1),%%xmm3\n\t"
		       "sfence\n\t"
		       "movdqa  %%xmm0,   (%0)\n\t"
		       "movdqa  %%xmm1, 16(%0)\n\t"
       		       "movdqa  %%xmm2, 32(%0)\n\t"
       		       "movdqa  %%xmm3, 48(%0)\n\t"
       		       "sfence\n\t"
		       ::"a"(to), "d"(from) : "memory" MX_ASM_XMM_DEP);
  }
}
#endif /* MX_COPY_WITH_SSE2 */

#if MX_COPY_WITH_ALTIVEC
/*
 * Copy one 64-byte request from `from` to `to` as four 16-byte chunks.
 *
 * The sequence is: sync, four loads (offsets 0/16/32/48 of `from`) into
 * registers 0-3, four stores to the matching offsets of `to`, sync.
 * LVX and STVX are macros defined elsewhere -- presumably they expand to the
 * AltiVec lvx/stvx mnemonics; confirm.
 *
 * Each of the eight addresses is passed as a separate "b"-constrained input
 * operand (%0-%3 are the destination addresses, %4-%7 the source addresses).
 *
 * NOTE(review): the asm statement declares no outputs and no clobbers, even
 * though the template names registers 0-3 and stores to memory -- confirm
 * this is intentional.
 */
static inline void
mx_copy_ureq_with_altivec(void *to, uint64_t *from)
{
  __asm__ __volatile__(
	"	sync\n"
	LVX	"0,  0, %4\n"
	LVX	"1,  0, %5\n"
	LVX	"2,  0, %6\n"
	LVX	"3,  0, %7\n"
	STVX	"0,  0, %0\n"
	STVX	"1,  0, %1\n"
	STVX	"2,  0, %2\n"
	STVX	"3,  0, %3\n"
	"	sync\n"
	:
	: "b"(to), 
	  "b"(((char*)to)+16), 
	  "b"(((char*)to)+32), 
	  "b"(((char*)to)+48), 
	  "b"(from), 
	  "b"(((char*)from)+16),
	  "b"(((char*)from)+32),
	  "b"(((char*)from)+48)
        );
}
#endif /* MX_COPY_WITH_ALTIVEC */

#if MX_COPY_WITH_PPC_ASM
/*
 * Copy one 64-byte request from `from` to `to` as eight 8-byte chunks.
 *
 * The sequence is: sync, eight loads from offsets 0..56 of `from` into
 * registers 0-7, eight stores to the same offsets of `to`, sync.
 * LFD and STFD are macros defined elsewhere -- presumably they expand to the
 * PowerPC lfd/stfd (floating-point double load/store) mnemonics; confirm.
 *
 * `to` is operand %0 and `from` is operand %1, both "b"-constrained inputs.
 *
 * NOTE(review): the asm statement declares no outputs and no clobbers, even
 * though the template names registers 0-7 and stores to memory -- confirm
 * this is intentional.
 */
static inline void
mx_copy_ureq_with_ppc_asm(void *to, uint64_t *from)
{
  __asm__ __volatile__(
	"	sync\n"
	LFD	"0,  0(%1)\n"
	LFD	"1,  8(%1)\n"
	LFD	"2, 16(%1)\n"
	LFD	"3, 24(%1)\n"
	LFD	"4, 32(%1)\n"
	LFD	"5, 40(%1)\n"
	LFD	"6, 48(%1)\n"
	LFD	"7, 56(%1)\n"
	STFD	"0,  0(%0)\n"
	STFD	"1,  8(%0)\n"
	STFD	"2, 16(%0)\n"
	STFD	"3, 24(%0)\n"
	STFD	"4, 32(%0)\n"
	STFD	"5, 40(%0)\n"
	STFD	"6, 48(%0)\n"
	STFD	"7, 56(%0)\n"
	"	sync\n"
	:
	: "b"(to), "b"(from)
        );
}
#endif /* MX_COPY_WITH_PPC_ASM */

/*
 * Copy one mcp_ureq_t from `from` to `to` with whichever copy routine the
 * build selected, asking for the non-fenced variant where the routine has
 * one (SSE2 is called with fence = 0, the generic 64-bit copy with
 * fence = 0).  The AltiVec and PPC asm routines take no fence argument.
 */
static inline void
mx_copy_ureq_without_fence(void *to, void *from)
{
  uint64_t *src = from;

#if MX_COPY_WITH_SSE2
  mx_copy_ureq_with_sse2(to, src, 0);
#elif MX_COPY_WITH_ALTIVEC
  mx_copy_ureq_with_altivec(to, src);
#elif MX_COPY_WITH_PPC_ASM
  mx_copy_ureq_with_ppc_asm(to, src);
#else
  mx_copy64_inline(to, src, sizeof (mcp_ureq_t), 0);
#endif
}

/*
 * Copy a request so that its last 32-bit word reaches the destination after
 * the rest of the request.
 *
 * The last 32-bit word of `from` (named "type" here -- presumably the word
 * that carries the request type; confirm against mcp_ureq_t) is saved and
 * temporarily zeroed in the source, the whole request is copied with
 * mx_copy_ureq_without_fence(), and the saved word is then written to the
 * destination, followed by MX_STBAR().  Finally the source is restored.
 *
 * Note that `from` is modified during the call and restored before return.
 */
static inline void
mx_copy_ureq_with_type_later(void *to, void *from)
{
  /* Index of the last 32-bit word of a mcp_ureq_t.  The replacement list is
   * fully parenthesised so it expands as a single operand in any expression. */
#define TYPE32_POSITION_IN_UREQ (sizeof(mcp_ureq_t)/4-1)
  uint32_t type32 = ((uint32_t*)from)[TYPE32_POSITION_IN_UREQ];
  ((uint32_t*)from)[TYPE32_POSITION_IN_UREQ] = 0;
  mx_copy_ureq_without_fence(to, from);
  ((uint32_t*)to)[TYPE32_POSITION_IN_UREQ] = type32;
  MX_STBAR();
  /* restore the original type since the request might be reposted
   * later (especially with mx__repost_ureq_tiny()). */
  ((uint32_t*)from)[TYPE32_POSITION_IN_UREQ] = type32;
}

/***************************
 * Main ureq copy routines *
 ***************************/
/*
 * Copy a prepared request `from` into the request slot `to`.
 *
 * ze != 0:
 *   The destination is asserted to have bits 0xf00 clear and is then offset
 *   by (type << 8).  When MX_OS_UDRV && Mx_endpoints->lxgdb holds, the
 *   request is wrapped in a mx_write_pio_req_t (offset within the
 *   MX_MCP_VPAGE_SIZE page, len 64) and handed to mx__write_pio_req();
 *   otherwise it is copied with mx_copy_ureq_without_fence().
 * ze == 0:
 *   If mx__opt.intel_cpu is set, mx_copy_ureq_with_type_later() is used.
 *   Otherwise the build-selected routine is used: SSE2 (fenced according to
 *   mx__opt.wc_fence), AltiVec, PPC asm, or the generic inline copy (64-bit
 *   words with fence on 8-byte-pointer builds, 32-bit words otherwise).
 */
static inline void
mx_copy_ureq(int ze, void *to, void *from, mcp_ureq_type_t type)
{
  if (ze) {
    mx_assert(((uintptr_t)to & 0xf00) == 0);
    to = (char *)to + (type << 8);
    if (!(MX_OS_UDRV && Mx_endpoints->lxgdb)) {
      mx_copy_ureq_without_fence(to, from);
    } else {
      mx_write_pio_req_t pio;
      pio.offset = (uint32_t)((uintptr_t)to & (MX_MCP_VPAGE_SIZE - 1));
      mx_always_assert((pio.offset & 63) == 0);
      pio.len = 64;
      memcpy(pio.data, from, sizeof(mcp_ureq_t));
      /* everything is allowed with udrv, even something as bad as
	 getting the current endpoint with Mx_endpoints :-) */
      mx__write_pio_req(Mx_endpoints->handle, &pio);
    }
    return;
  }

  if (mx__opt.intel_cpu) {
    mx_copy_ureq_with_type_later(to, from);
  } else {
#if MX_COPY_WITH_SSE2
    mx_copy_ureq_with_sse2(to, from, mx__opt.wc_fence);
#elif MX_COPY_WITH_ALTIVEC
    mx_copy_ureq_with_altivec(to, from);
#elif MX_COPY_WITH_PPC_ASM
    mx_copy_ureq_with_ppc_asm(to, from);
#else
    if (sizeof (void *) != 8) {
      mx_copy32_inline(to, from, sizeof (mcp_ureq_t));
    } else {
      mx_copy64_inline(to, from, sizeof (mcp_ureq_t), 1);
    }
#endif
  }
}

/*************************
 * ureq posting routines *
 *************************/

/* Be cache friendly */
#define MCP_UREQ_ALIGNMENT 64

/* Altivec _requires_ 16-bytes aligned request, SSE2 is faster when aligned. */
#if (MX_COPY_WITH_SSE2 || MX_COPY_WITH_ALTIVEC) && (MCP_UREQ_ALIGNMENT % 16 || !MCP_UREQ_ALIGNMENT)
#error UREQ batch must be aligned on 16bytes for Altivec or SSE
#endif

/*
 * Round the address `buffer` up to the next multiple of MCP_UREQ_ALIGNMENT
 * and yield it as a void pointer.  The argument and the whole expansion are
 * parenthesised so the macro can be used with any expression argument and
 * inside any larger expression.
 */
#define ALIGN_MCP_UREQ(buffer) \
  ((void *)(((uintptr_t)(buffer) + MCP_UREQ_ALIGNMENT - 1) \
	    & ~(uintptr_t)(MCP_UREQ_ALIGNMENT - 1)))

/*
 * Declare a local backing array `name_buffer`, large enough to hold one
 * mcp_ureq_t at any alignment, plus a pointer `name` to the first
 * MCP_UREQ_ALIGNMENT-aligned address inside it.  Expands to two
 * declarations, so it must be used where declarations are allowed.
 */
#define ALIGNED_MCP_UREQ(name) \
  char name##_buffer[sizeof(mcp_ureq_t) + MCP_UREQ_ALIGNMENT]; \
  mcp_ureq_t *name = ALIGN_MCP_UREQ(&name##_buffer)

/*
 * Build a SEND_TINY request in the caller-provided `shadow` buffer (aligned
 * up with ALIGN_MCP_UREQ) and copy it to `req` with mx_copy_ureq().
 *
 * The header is filled from `partner`, `match_info` (split into two 32-bit
 * halves), `length`, `seqnum` and `cookie`; multi-byte fields go through
 * htons()/htonl().  partner->recv_acked is updated to
 * partner->fully_recv_seq, which is also sent as the piggybacked ack.
 * When `length` is non-zero the payload is copied into tiny.data from the
 * single segment (count == 1) or gathered from `count` segments.
 */
static inline void
mx__post_ureq_tiny(int ze, mcp_ureq_t *req,
		   struct mx__partner *partner, uint64_t match_info,
		   uint32_t length, uint16_t seqnum, uint16_t cookie, 
		   mx_segment_t * segs, uint32_t count, uintptr_t memory_context,
		   void *shadow)
{
  mcp_ureq_t *staged = ALIGN_MCP_UREQ(shadow);
  const uint32_t match_hi = (uint32_t) (match_info >> 32);
  const uint32_t match_lo = (uint32_t) (match_info & 0xffffffff);

  /* Request kind and destination. */
  staged->tiny.type = MX_MCP_UREQ_SEND_TINY;
  staged->tiny.dest_endpt = partner->eid;
  staged->tiny.dest_peer_index = partner->peer_index_n;
  staged->tiny.session = partner->endpoint_sid_n;

  /* Message description. */
  staged->tiny.length = htons(length);
  staged->tiny.match_a = htonl(match_hi);
  staged->tiny.match_b = htonl(match_lo);
  staged->tiny.lib_seqnum = htons(seqnum);
  staged->tiny.lib_cookie = htons(cookie);

  /* Piggyback the ack for everything fully received so far. */
  partner->recv_acked = partner->fully_recv_seq;
  staged->tiny.lib_piggyack = htons(partner->fully_recv_seq);

  /* Inline payload, if any. */
  if (likely(length)) {
    if (likely(count == 1)) {
      mx_memcpy_from_segment(staged->tiny.data, segs[0].segment_ptr,
			     length, memory_context);
    } else {
      mx__copy_from_segments((char*) staged->tiny.data, segs, count,
			     memory_context, 0, length);
    }
  }

  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->tiny.pad0, sizeof(staged->tiny.pad0));

  mx_copy_ureq(ze, req, staged->int_array, MX_MCP_UREQ_SEND_TINY);
}

/*
 * Re-send a tiny request that is still sitting in `shadow` from an earlier
 * post.  Only the piggybacked ack (refreshed from partner->fully_recv_seq,
 * which is also recorded in partner->recv_acked) and the cookie are
 * rewritten; every other field is reused as-is.
 */
static inline void
mx__repost_ureq_tiny(int ze, mcp_ureq_t *req, struct mx__partner *partner,
		     uint16_t cookie, void *shadow)
{
  mcp_ureq_t *staged = ALIGN_MCP_UREQ(shadow);

  partner->recv_acked = partner->fully_recv_seq;
  staged->tiny.lib_piggyack = htons(partner->fully_recv_seq);
  staged->tiny.lib_cookie = htons(cookie);

  mx_copy_ureq(ze, req, staged->int_array, MX_MCP_UREQ_SEND_TINY);
}

/*
 * Build a SEND_SMALL request in an aligned on-stack buffer and copy it to
 * `req` with mx_copy_ureq().
 *
 * The header is filled from `partner`, `match_info` (split into two 32-bit
 * halves), `length`, `seqnum`, `cookie` and `offset`; multi-byte fields go
 * through htons()/htonl().  partner->recv_acked is updated to
 * partner->fully_recv_seq, which is also sent as the piggybacked ack.
 */
static inline void
mx__post_ureq_small(int ze, mcp_ureq_t *req,
		    struct mx__partner *partner, uint64_t match_info,
		    uint32_t length, uint16_t seqnum, uint16_t cookie, 
		    uint16_t offset)
{
  ALIGNED_MCP_UREQ(staged);
  const uint32_t match_hi = (uint32_t) (match_info >> 32);
  const uint32_t match_lo = (uint32_t) (match_info & 0xffffffff);

  /* Request kind and destination. */
  staged->small.type = MX_MCP_UREQ_SEND_SMALL;
  staged->small.dest_endpt = partner->eid;
  staged->small.dest_peer_index = partner->peer_index_n;
  staged->small.session = partner->endpoint_sid_n;

  /* Message description. */
  staged->small.length = htons(length);
  staged->small.offset = htons(offset);
  staged->small.match_a = htonl(match_hi);
  staged->small.match_b = htonl(match_lo);
  staged->small.lib_seqnum = htons(seqnum);
  staged->small.lib_cookie = htons(cookie);

  /* Piggyback the ack for everything fully received so far. */
  partner->recv_acked = partner->fully_recv_seq;
  staged->small.lib_piggyack = htons(partner->fully_recv_seq);

  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->small.pad0, sizeof(staged->small.pad0));
  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->small.pad1, sizeof(staged->small.pad1));

  mx_copy_ureq(ze, req, staged->int_array, MX_MCP_UREQ_SEND_SMALL);
}

/*
 * Build a SEND_MEDIUM request in an aligned on-stack buffer and copy it to
 * `req` with mx_copy_ureq().
 *
 * The header is filled from `partner`, `match_info` (split into two 32-bit
 * halves), `length`, `seqnum`, `cookie`, `sendq_index`, `credits` and
 * `pipeline`; 16/32-bit fields go through htons()/htonl(), the two 8-bit
 * fields are stored as given.  partner->recv_acked is updated to
 * partner->fully_recv_seq, which is also sent as the piggybacked ack.
 */
static inline void
mx__post_ureq_medium(int ze, mcp_ureq_t *req,
		     struct mx__partner *partner, uint64_t match_info,
		     uint32_t length, uint16_t seqnum, uint16_t cookie, 
		     uint16_t sendq_index, uint8_t credits, uint8_t pipeline)
{
  ALIGNED_MCP_UREQ(staged);
  const uint32_t match_hi = (uint32_t) (match_info >> 32);
  const uint32_t match_lo = (uint32_t) (match_info & 0xffffffff);

  /* Request kind and destination. */
  staged->medium.type = MX_MCP_UREQ_SEND_MEDIUM;
  staged->medium.dest_endpt = partner->eid;
  staged->medium.dest_peer_index = partner->peer_index_n;
  staged->medium.session = partner->endpoint_sid_n;

  /* Message description. */
  staged->medium.length = htons(length);
  staged->medium.match_a = htonl(match_hi);
  staged->medium.match_b = htonl(match_lo);
  staged->medium.lib_seqnum = htons(seqnum);
  staged->medium.lib_cookie = htons(cookie);

  /* Send-queue placement and flow control. */
  staged->medium.sendq_index = htons(sendq_index);
  staged->medium.credits = credits;
  staged->medium.pipeline = pipeline;

  /* Piggyback the ack for everything fully received so far. */
  partner->recv_acked = partner->fully_recv_seq;
  staged->medium.lib_piggyack = htons(partner->fully_recv_seq);

  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->medium.pad0, sizeof(staged->medium.pad0));
  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->medium.pad1, sizeof(staged->medium.pad1));

  mx_copy_ureq(ze, req, staged->int_array, MX_MCP_UREQ_SEND_MEDIUM);
}

/*
 * Post a SEND_MEDIUM_CONT request in "ze" mode.
 *
 * No field of the staged request is filled in: mx_copy_ureq() is called with
 * ze = 1, in which case it offsets the destination by (type << 8) before
 * copying.  The staged buffer therefore holds whatever was on the stack, so
 * the whole buffer is marked readable for valgrind before it is copied.
 *
 * The annotation must cover the request buffer itself (`batch`,
 * sizeof(*batch)); `batch` is a pointer declared by ALIGNED_MCP_UREQ, so
 * `&batch` / sizeof(batch) would only cover the pointer variable.
 */
static inline void
mx__post_ureq_medium_cont_ze(mcp_ureq_t *req)
{
  ALIGNED_MCP_UREQ(batch);
  
  MX_VALGRIND_MEMORY_MAKE_READABLE(batch, sizeof(*batch));

  mx_copy_ureq(1, req, batch->int_array, MX_MCP_UREQ_SEND_MEDIUM_CONT);
}

/*
 * Build a SEND_RNDV request in an aligned on-stack buffer and copy it to
 * `req` with mx_copy_ureq().
 *
 * The request uses the `tiny` layout with a fixed length of 8.  The 8-byte
 * payload written into tiny.data is:
 *   data[0..3]  msg_length (htonl)
 *   data[4]     rdma_id
 *   data[5]     rdma_seqnum
 *   data[6..7]  offset (htons)
 * The rest of tiny.data is left unset and only marked readable for valgrind.
 * partner->recv_acked is updated to partner->fully_recv_seq, which is also
 * sent as the piggybacked ack.
 */
static inline void
mx__post_ureq_rndv(int ze, mcp_ureq_t *req,
		   struct mx__partner *partner, uint64_t match_info,
		   uint32_t msg_length, uint16_t seqnum, uint16_t cookie, 
		   uint8_t rdma_id, uint8_t rdma_seqnum, uint16_t offset)
{
  ALIGNED_MCP_UREQ(staged);
  const uint32_t match_hi = (uint32_t) (match_info >> 32);
  const uint32_t match_lo = (uint32_t) (match_info & 0xffffffff);

  /* Destination. */
  staged->tiny.dest_endpt = partner->eid;
  staged->tiny.dest_peer_index = partner->peer_index_n;
  staged->tiny.session = partner->endpoint_sid_n;

  /* Message description; the payload is always the 8-byte descriptor. */
  staged->tiny.length = htons(8);
  staged->tiny.match_a = htonl(match_hi);
  staged->tiny.match_b = htonl(match_lo);
  staged->tiny.lib_seqnum = htons(seqnum);
  staged->tiny.lib_cookie = htons(cookie);

  /* Piggyback the ack for everything fully received so far. */
  partner->recv_acked = partner->fully_recv_seq;
  staged->tiny.lib_piggyack = htons(partner->fully_recv_seq);

  /* Rendezvous descriptor carried as the tiny payload. */
  *(uint32_t *) &(staged->tiny.data[0]) = htonl(msg_length);
  *(uint8_t *) &(staged->tiny.data[4]) = rdma_id;
  *(uint8_t *) &(staged->tiny.data[5]) = rdma_seqnum;
  *(uint16_t *) &(staged->tiny.data[6]) = htons(offset);

  staged->tiny.type = MX_MCP_UREQ_SEND_RNDV;

  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->tiny.pad0, sizeof(staged->tiny.pad0));
  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->tiny.data[8], sizeof(staged->tiny.data)-8);

  mx_copy_ureq(ze, req, staged->int_array, MX_MCP_UREQ_SEND_RNDV);
}

/*
 * Build a PULL request in an aligned on-stack buffer and copy it to `req`
 * with mx_copy_ureq().
 *
 * The transfer length comes from r->recv.basic.status.xfer_length (htonl).
 * r->recv.remote_rdma is stored as one 32-bit word starting at
 * pull.target_rdmawin_id, without byte swapping.  The origin window id,
 * sequence number and offset (htons) and the cookie (htons) come from the
 * arguments.
 */
static inline void
mx__post_ureq_pull(int ze, mcp_ureq_t *req,
		   struct mx__partner *partner, union mx_request *r, 
		   uint8_t origin_rdma_id, uint8_t origin_rdma_seqnum, 
		   uint16_t origin_rdma_offset, uint16_t cookie)
{
  ALIGNED_MCP_UREQ(staged);

  /* Destination and transfer size. */
  staged->pull.session = partner->connect_session_n;
  staged->pull.dest_peer_index = partner->peer_index_n;
  staged->pull.dest_endpt = partner->eid;
  staged->pull.length = htonl(r->recv.basic.status.xfer_length);

  /* Remote window: one 32-bit store starting at target_rdmawin_id.  Kept
   * between the stores above and below, exactly as in the original order. */
  *((uint32_t *) &(staged->pull.target_rdmawin_id)) = r->recv.remote_rdma;

  /* Local window and completion cookie. */
  staged->pull.origin_rdma_offset = htons(origin_rdma_offset);
  staged->pull.origin_rdmawin_seqnum = origin_rdma_seqnum;
  staged->pull.origin_rdmawin_id = origin_rdma_id;
  staged->pull.lib_cookie = htons(cookie);
  staged->pull.type = MX_MCP_UREQ_PULL;

  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->pull.pad0, sizeof(staged->pull.pad0));
  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->pull.pad1, sizeof(staged->pull.pad1));

  mx_copy_ureq(ze, req, staged->int_array, MX_MCP_UREQ_PULL);
}

/*
 * Build a SEND_NOTIFY request in an aligned on-stack buffer and copy it to
 * `req` with mx_copy_ureq().
 *
 * The transfer length comes from r->recv.basic.status.xfer_length (htonl).
 * r->recv.remote_rdma is stored as one 32-bit word starting at
 * notify.target_rdmawin_id, without byte swapping.  partner->recv_acked is
 * updated to partner->fully_recv_seq, which is also sent as the piggybacked
 * ack.
 */
static inline void
mx__post_ureq_notify(int ze, mcp_ureq_t *req, struct mx__partner * partner, 
		     union mx_request *r, uint16_t seqnum, uint16_t cookie)
{
  ALIGNED_MCP_UREQ(staged);

  /* Destination and transfer size. */
  staged->notify.session = partner->connect_session_n;
  staged->notify.dest_peer_index = partner->peer_index_n;
  staged->notify.dest_endpt = partner->eid;
  staged->notify.length = htonl(r->recv.basic.status.xfer_length);

  /* Remote window: one 32-bit store starting at target_rdmawin_id.  Kept
   * between the stores above and below, exactly as in the original order. */
  *((uint32_t *) &(staged->notify.target_rdmawin_id)) = r->recv.remote_rdma;

  /* Sequencing, cookie and piggybacked ack. */
  staged->notify.lib_seqnum = htons(seqnum);
  staged->notify.lib_cookie = htons(cookie);
  partner->recv_acked = partner->fully_recv_seq;
  staged->notify.lib_piggyack = htons(partner->fully_recv_seq);
  staged->notify.type = MX_MCP_UREQ_SEND_NOTIFY;

  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->notify.pad0, sizeof(staged->notify.pad0));
  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->notify.pad1, sizeof(staged->notify.pad1));
  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->notify.pad2, sizeof(staged->notify.pad2));

  mx_copy_ureq(ze, req, staged->int_array, MX_MCP_UREQ_SEND_NOTIFY);
}

/*
 * Build a SEND_CONNECT request in an aligned on-stack buffer and copy it to
 * `req` with mx_copy_ureq().
 *
 * The request header is filled from the connect request `r` and `cookie`;
 * lib_seqnum is always 0.  The connect payload is reached through
 * MX__CONNECT_DATA(&batch->connect) and filled from `r` and `is_reply`.
 * Fields whose source name ends in `_n` are copied without conversion;
 * seqnum_start goes through htons().
 *
 * dest_peer_index is stored once (the original code stored the same value
 * twice).
 */
static inline void
mx__post_ureq_connect(int ze, mcp_ureq_t *req, union mx_request *r, 
		      uint16_t cookie, uint8_t is_reply)
{
  ALIGNED_MCP_UREQ(batch);
  struct mx__connect_data *connect = MX__CONNECT_DATA(&batch->connect);
  batch->connect.dest_endpt = r->connect.peer_endpoint_id;
  
  batch->connect.dest_peer_index = r->connect.peer_index_n;
  /* NOTE(review): 16 is hard-coded and stored without htons() -- presumably
   * the size of the connect payload in a one-byte field; confirm. */
  batch->connect.length = 16;
  batch->connect.lib_seqnum = htons(0);
  batch->connect.lib_cookie = htons(cookie);
  
  connect->dest_session_n = r->connect.dest_session_n;
  connect->app_key_n = r->connect.app_key_n;
  connect->seqnum_start_n = htons(r->connect.seqnum_start);
  connect->is_reply = is_reply;
  connect->connect_seqnum_n = r->connect.connect_seqnum_n;
  connect->status_code = r->connect.status_code_n;
  batch->connect.type = MX_MCP_UREQ_SEND_CONNECT;

  MX_VALGRIND_MEMORY_MAKE_READABLE(&batch->connect.pad0, sizeof(batch->connect.pad0));
  MX_VALGRIND_MEMORY_MAKE_READABLE(&batch->connect.pad1, sizeof(batch->connect.pad1));
  MX_VALGRIND_MEMORY_MAKE_READABLE(&batch->connect.pad2, sizeof(batch->connect.pad2));
  MX_VALGRIND_MEMORY_MAKE_READABLE(&batch->connect.pad3, sizeof(batch->connect.pad3));
  MX_VALGRIND_MEMORY_MAKE_READABLE(&batch->connect.data, sizeof(batch->connect.data));

  mx_copy_ureq(ze, req, batch->int_array, MX_MCP_UREQ_SEND_CONNECT);
}

/*
 * Complete the header of a SEND_TRUC request that the caller has staged in
 * `batch`, then copy it to `req` with mx_copy_ureq().
 *
 * Only the header fields are written here: type, destination (`endpt`,
 * `peer_n`), `session`, the cookie (htons) and a length fixed to
 * sizeof(union mx__lib2lib).  `peer_n` and `session` are stored without
 * conversion.
 */
static inline void
mx__post_ureq_truc(int ze, mcp_ureq_t *req, mcp_ureq_t *batch, uint16_t peer_n,
		   uint8_t endpt, uint32_t session, uint16_t cookie)
{
  /* Request kind and destination. */
  batch->truc.type = MX_MCP_UREQ_SEND_TRUC;
  batch->truc.dest_peer_index = peer_n;
  batch->truc.dest_endpt = endpt;
  batch->truc.session = session;

  /* Payload size and completion cookie. */
  batch->truc.length = sizeof(union mx__lib2lib);
  batch->truc.lib_cookie = htons(cookie);

  MX_VALGRIND_MEMORY_MAKE_READABLE(&batch->truc.pad0, sizeof(batch->truc.pad0));
  MX_VALGRIND_MEMORY_MAKE_READABLE(&batch->truc.pad1, sizeof(batch->truc.pad1));

  mx_copy_ureq(ze, req, batch->int_array, MX_MCP_UREQ_SEND_TRUC);
}

/*
 * Build a WAKE request in an aligned on-stack buffer and copy it to `req`
 * with mx_copy_ureq().  The only payload is `eventq_flow`, stored through
 * htonl().
 */
static inline void 
mx__post_ureq_wake(int ze, mcp_ureq_t *req, uint32_t eventq_flow)
{
  ALIGNED_MCP_UREQ(staged);

  staged->wake.type = MX_MCP_UREQ_WAKE;
  staged->wake.eventq_flow = htonl(eventq_flow);

  MX_VALGRIND_MEMORY_MAKE_READABLE(&staged->wake.pad, sizeof(staged->wake.pad));

  mx_copy_ureq(ze, req, staged->int_array, MX_MCP_UREQ_WAKE);
}

#endif

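/***************************************
 * PIO small messages, up to 128 bytes *
 * length is a multiple of 32          *
 ***************************************/
/* NOTE(review): everything from here to the end of the file comes after the
 * #endif that closes the MX__REQUESTS_H include guard, so the definitions
 * below are not protected against multiple inclusion.  Confirm whether the
 * guard's #endif should be moved to the end of the file. */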
#if MX_COPY_WITH_SSE2
/*
 * Copy `size` bytes (64, 96 or 128 only) from `from` to `to` using SSE2
 * registers; any other size hits mx_always_assert(0).
 *
 * All loads use movdqu (no alignment requirement on `from`).  The first
 * 32 bytes of the destination are stored with movdqu; every store from
 * offset 32 onwards uses movdqa, which requires a 16-byte aligned address.
 *
 * fence != 0: an sfence is issued after each 32 bytes stored.
 * fence == 0: a single sfence is issued after the last store.
 *
 * `to` is passed in the "a" register and `from` in the "d" register; the
 * asm declares a "memory" clobber.  MX_ASM_XMM_DEP is defined elsewhere --
 * presumably it appends the xmm register clobbers; confirm.
 *
 * NOTE(review): a size of 32 is not handled here although it is a multiple
 * of 32 -- confirm callers never pass it on SSE2 builds.
 */
static inline void
mx_copy_small_with_sse2(void *to, uint64_t *from, int size, int fence)
{
  if (fence) {
    /* Fenced variants: sfence after every 32-byte group. */
    if (size == 64)
      __asm__ __volatile__(
			   "movdqu    (%1),%%xmm0\n\t"
			   "movdqu  16(%1),%%xmm1\n\t"
			   "movdqu  32(%1),%%xmm2\n\t"
			   "movdqu  48(%1),%%xmm3\n\t"
			   "movdqu  %%xmm0,   (%0)\n\t"
			   "movdqu  %%xmm1, 16(%0)\n\t"
			   "sfence\n\t"
			   "movdqa  %%xmm2, 32(%0)\n\t"
			   "movdqa  %%xmm3, 48(%0)\n\t"
			   "sfence\n\t"
			   ::"a"(to), "d"(from) : "memory" MX_ASM_XMM_DEP);
    else if (size == 96)
      __asm__ __volatile__(
			   "movdqu    (%1),%%xmm0\n\t"
			   "movdqu  16(%1),%%xmm1\n\t"
			   "movdqu  32(%1),%%xmm2\n\t"
			   "movdqu  48(%1),%%xmm3\n\t"
			   "movdqu  64(%1),%%xmm4\n\t"
			   "movdqu  80(%1),%%xmm5\n\t"
			   "movdqu  %%xmm0,   (%0)\n\t"
			   "movdqu  %%xmm1, 16(%0)\n\t"
			   "sfence\n\t"
			   "movdqa  %%xmm2, 32(%0)\n\t"
			   "movdqa  %%xmm3, 48(%0)\n\t"
			   "sfence\n\t"
			   "movdqa  %%xmm4, 64(%0)\n\t"
			   "movdqa  %%xmm5, 80(%0)\n\t"
			   "sfence\n\t"
			   ::"a"(to), "d"(from) : "memory" MX_ASM_XMM_DEP);
    else if (size == 128)
      __asm__ __volatile__(
			   "movdqu    (%1),%%xmm0\n\t"
			   "movdqu  16(%1),%%xmm1\n\t"
			   "movdqu  32(%1),%%xmm2\n\t"
			   "movdqu  48(%1),%%xmm3\n\t"
			   "movdqu  64(%1),%%xmm4\n\t"
			   "movdqu  80(%1),%%xmm5\n\t"
			   "movdqu  96(%1),%%xmm6\n\t"
			   "movdqu 112(%1),%%xmm7\n\t"
			   "movdqu  %%xmm0,   (%0)\n\t"
			   "movdqu  %%xmm1, 16(%0)\n\t"
			   "sfence\n\t"
			   "movdqa  %%xmm2, 32(%0)\n\t"
			   "movdqa  %%xmm3, 48(%0)\n\t"
			   "sfence\n\t"
			   "movdqa  %%xmm4, 64(%0)\n\t"
			   "movdqa  %%xmm5, 80(%0)\n\t"
			   "sfence\n\t"
			   "movdqa  %%xmm6, 96(%0)\n\t"
			   "movdqa  %%xmm7,112(%0)\n\t"
			   "sfence\n\t"
			   ::"a"(to), "d"(from) : "memory" MX_ASM_XMM_DEP);
    else
      mx_always_assert(0);
  } else {
    /* Unfenced variants: one sfence after the whole copy. */
    if (size == 64)
      __asm__ __volatile__(
			   "movdqu    (%1),%%xmm0\n\t"
			   "movdqu  16(%1),%%xmm1\n\t"
			   "movdqu  32(%1),%%xmm2\n\t"
			   "movdqu  48(%1),%%xmm3\n\t"
			   "movdqu  %%xmm0,   (%0)\n\t"
			   "movdqu  %%xmm1, 16(%0)\n\t"
			   "movdqa  %%xmm2, 32(%0)\n\t"
			   "movdqa  %%xmm3, 48(%0)\n\t"
			   "sfence\n\t"
			   ::"a"(to), "d"(from) : "memory" MX_ASM_XMM_DEP);
    else if (size == 96)
      __asm__ __volatile__(
			   "movdqu    (%1),%%xmm0\n\t"
			   "movdqu  16(%1),%%xmm1\n\t"
			   "movdqu  32(%1),%%xmm2\n\t"
			   "movdqu  48(%1),%%xmm3\n\t"
			   "movdqu  64(%1),%%xmm4\n\t"
			   "movdqu  80(%1),%%xmm5\n\t"
			   "movdqu  %%xmm0,   (%0)\n\t"
			   "movdqu  %%xmm1, 16(%0)\n\t"
			   "movdqa  %%xmm2, 32(%0)\n\t"
			   "movdqa  %%xmm3, 48(%0)\n\t"
			   "movdqa  %%xmm4, 64(%0)\n\t"
			   "movdqa  %%xmm5, 80(%0)\n\t"
			   "sfence\n\t"
			   ::"a"(to), "d"(from) : "memory" MX_ASM_XMM_DEP);
    else if (size == 128)
      __asm__ __volatile__(
			   "movdqu    (%1),%%xmm0\n\t"
			   "movdqu  16(%1),%%xmm1\n\t"
			   "movdqu  32(%1),%%xmm2\n\t"
			   "movdqu  48(%1),%%xmm3\n\t"
			   "movdqu  64(%1),%%xmm4\n\t"
			   "movdqu  80(%1),%%xmm5\n\t"
			   "movdqu  96(%1),%%xmm6\n\t"
			   "movdqu 112(%1),%%xmm7\n\t"
			   "movdqu  %%xmm0,   (%0)\n\t"
			   "movdqu  %%xmm1, 16(%0)\n\t"
			   "movdqa  %%xmm2, 32(%0)\n\t"
			   "movdqa  %%xmm3, 48(%0)\n\t"
			   "movdqa  %%xmm4, 64(%0)\n\t"
			   "movdqa  %%xmm5, 80(%0)\n\t"
			   "movdqa  %%xmm6, 96(%0)\n\t"
			   "movdqa  %%xmm7,112(%0)\n\t"
			   "sfence\n\t"
			   ::"a"(to), "d"(from) : "memory" MX_ASM_XMM_DEP);
    else
      mx_always_assert(0);
  }
}
#endif /* MX_COPY_WITH_SSE2 */

/*
 * Copy a small PIO message of `size` bytes from `fromp` to `top`.
 *
 * On SSE2 builds the work is delegated to mx_copy_small_with_sse2().
 * Otherwise the data is copied in 32-byte groups, each as four 64-bit
 * loads/stores; when `sfence` is non-zero MX_STBAR() is issued after every
 * group.
 *
 * Generic-path contract (checked by mx_assert): `top` is 32-byte aligned,
 * `size` is a multiple of 32, and `fromp` is 8-byte aligned.  The check
 * uses logical negation; the earlier bitwise-complement form
 * ((~(to | size) & 31) && ~(from & 7)) was true for almost every input and
 * so verified nothing.
 */
static inline void
mx_zmemcpy_128(void *top, void *fromp, int size, int sfence)
{
#if MX_COPY_WITH_SSE2
  mx_copy_small_with_sse2(top, fromp, size, sfence);
#else
  /* MX_COPY_WITH_ALTIVEC case (needs alignment) ? MX_COPY_WITH_PPC_ASM case ? */
  uintptr_t to = (uintptr_t)top;
  uintptr_t from = (uintptr_t)fromp;
  int i;
  mx_assert(!((to | size) & 31) && !(from & 7));
  for (i = 0;i < size; i+= 32) {
    uint64_t a, b;
    /* First 16 bytes of the group. */
    a = *(uint64_t*)(from + i + 0);
    b = *(uint64_t*)(from + i + 8);
    *(uint64_t*)(to + i + 0) = a;
    *(uint64_t*)(to + i + 8) = b;
    /* Second 16 bytes of the group. */
    a = *(uint64_t*)(from + i + 16);
    b = *(uint64_t*)(from + i + 24);
    *(uint64_t*)(to + i + 16) = a;
    *(uint64_t*)(to + i + 24) = b;
    if (sfence)
      MX_STBAR();
  }
#endif
}


#if (MX_CPU_powerpc64 || MX_CPU_powerpc) && defined(__GNUC__)
/*
 * Issue a PowerPC "dcbf" instruction on each complete 128-byte line of the
 * range [p, p + len).
 *
 * The start address is rounded down to a 128-byte boundary and `len` is
 * extended by the same amount.  Only whole 128-byte steps are flushed, so a
 * trailing partial line is deliberately left alone.
 *
 * On other CPUs/compilers mx_dcbf() expands to nothing.
 */
static inline void
mx_dcbf(void *p, unsigned len)
{
  uintptr_t addr = (uintptr_t)p;
  unsigned align = addr & 127;
  unsigned i;

  addr -= align;
  len += align;

  /* if not align, don't flush last line in case we need the remainder just afterwards */
  /* i never exceeds len, so "len - i >= 128" is the same test as
   * "i + 127 < len" without any risk of the index arithmetic wrapping. */
  for (i = 0; len - i >= 128; i += 128)
    {
      __asm__ __volatile__("dcbf 0,%0" : : "r" (addr + i));
    }

}
#else
#define mx_dcbf(p, len)
#endif
